#!/usr/bin/env python

import os, re, util2, codecs, trans_langs, bz2, zlib

class Lang(object):
    """Description of a single language that translations are generated for.

    desc is an indexable sequence of the form
    (code, name, ms_lang_id) or (code, name, ms_lang_id, 'RTL');
    the optional fourth element marks a right-to-left language.
    """

    def __init__(self, desc):
        assert len(desc) <= 4
        has_rtl_marker = len(desc) > 3
        if has_rtl_marker:
            assert desc[3] == 'RTL'

        self.desc = desc
        self.code = desc[0]  # e.g. "af"
        self.name = desc[1]  # e.g. "Afrikaans"
        self.ms_lang_id = desc[2]
        self.isRtl = has_rtl_marker

        # variant of the code that is usable inside a C identifier,
        # e.g. "ca-xv" becomes "ca_xv"
        safe_code = self.code.replace("-", "_")
        self.code_safe = safe_code
        self.c_translations_array_name = "gTranslations_" + safe_code
        # filled in later, once translations have been matched up
        self.translations = []

def get_lang_objects(langs_defs):
    """Wrap every language description in langs_defs in a Lang object.

    Returns a new list, in the same order as langs_defs.
    """
    lang_objects = []
    for lang_desc in langs_defs:
        lang_objects.append(Lang(lang_desc))
    return lang_objects

# Fraction (0.2 == 20%) of a file's strings that may lack a translation.
# A language missing more than this fraction is considered incomplete
# for that file and is left out of the generated Trans_*_txt.cpp.
INCOMPLETE_MISSING_THRESHOLD = 0.2

# the "src" directory, located relative to the directory of this script
SRC_DIR = os.path.join(os.path.dirname(__file__), "..", "src")

# sub-directories of SRC_DIR whose .cpp files are scanned for
# translatable strings ("." is SRC_DIR itself)
C_DIRS_TO_PROCESS = [".", "installer", "browserplugin"]
# produce a simpler format for these dirs
C_SIMPLE_FORMAT_DIRS = ["browserplugin"]

def should_translate(file_name):
    """Return True if file_name names a .cpp file (case-insensitive)."""
    return file_name.lower().endswith(".cpp")

# Paths of all .cpp files found directly inside each of C_DIRS_TO_PROCESS
# (os.listdir does not descend into sub-directories). This runs at import
# time, so every listed directory must exist below SRC_DIR.
C_FILES_TO_PROCESS = []
for dir in C_DIRS_TO_PROCESS:
    d = os.path.join(SRC_DIR, dir)
    C_FILES_TO_PROCESS += [os.path.join(d, f) for f in os.listdir(d) if should_translate(f)]

# Matches _TR("...") and _TRN("...") and captures the text between the
# quotes. The capture is non-greedy, so it ends at the first `")`, and
# since "." does not match a newline the whole call must be on one line.
TRANSLATION_PATTERN = r'\b_TRN?\("(.*?)"\)'

def extract_strings_from_c_files(with_paths=False):
    """Collect the translatable strings from all files in C_FILES_TO_PROCESS.

    A translatable string is whatever TRANSLATION_PATTERN captures.

    with_paths -- if False (the default), the result contains the strings
                  themselves; if True, it contains (string, dir_name)
                  tuples, where dir_name is the last path component of the
                  directory holding the file the string was found in.

    Returns the collected items after passing them through util2.uniquify().
    """
    strings = []
    for path in C_FILES_TO_PROCESS:
        # the with-block closes the file right away instead of leaving
        # that to the garbage collector
        with open(path, "r") as src:
            file_content = src.read()
        file_strings = re.findall(TRANSLATION_PATTERN, file_content)
        if with_paths:
            # same for every string of this file, so compute it only once
            dir_name = os.path.basename(os.path.dirname(path))
            strings += [(s, dir_name) for s in file_strings]
        else:
            strings += file_strings
    return util2.uniquify(strings)

# Template of the C++ file written by gen_c_code_simple(), i.e. the
# "simple" format used for the dirs in C_SIMPLE_FORMAT_DIRS.
# It is filled in with the % operator and expects these keys:
#   translations_count, translations, langs_list,
#   lang_id_to_index, rtl_lang_cmp
TRANSLATIONS_TXT_SIMPLE = """\
/*
 DO NOT EDIT MANUALLY !!!
 Generated by scripts\\trans_gen.py
*/

#ifndef MAKELANGID
#include <windows.h>
#endif

int gTranslationsCount = %(translations_count)d;

const WCHAR * const gTranslations[] = {
%(translations)s
};

const char * const gLanguages[] = {
    %(langs_list)s
};

// from http://msdn.microsoft.com/en-us/library/windows/desktop/dd318693(v=vs.85).aspx
// those definition are not present in 7.0A SDK my VS 2010 uses
#ifndef LANG_CENTRAL_KURDISH
#define LANG_CENTRAL_KURDISH 0x92
#endif

#ifndef SUBLANG_CENTRAL_KURDISH_CENTRAL_KURDISH_IRAQ
#define SUBLANG_CENTRAL_KURDISH_CENTRAL_KURDISH_IRAQ 0x01
#endif

// note: this index isn't guaranteed to remain stable over restarts, so
// persist gLanguages[index/gTranslationsCount] instead
int GetLanguageIndex(LANGID id)
{
    switch (id) {
#define _LANGID(lang) MAKELANGID(lang, SUBLANG_NEUTRAL)
    %(lang_id_to_index)s
#undef _LANGID
    }
}

bool IsLanguageRtL(int index)
{
    return %(rtl_lang_cmp)s;
}
"""

# use octal escapes because a C hexadecimal escape has no fixed length
# and swallows every hex digit that follows it (\xABc isn't the same
# as \253c), whereas an octal escape ends after at most three digits
def c_oct(c):
    """Return the three-digit C octal escape sequence for character c.

    c is a single character; it is meant for byte values (0-255), which
    always fit in three octal digits.
    """
    # format the number directly instead of slicing the output of oct(),
    # whose prefix ("0" vs. "0o") differs between Python versions
    return "\\%03o" % ord(c)

def c_escape(txt):
    """Turn txt into the source text of a C string literal.

    None becomes the bare token NULL. Otherwise double quotes are
    backslash-escaped, every character in the range 0x80-0xFF is replaced
    by its octal escape (see c_oct) and the result is wrapped in quotes.
    """
    if txt is None:
        return "NULL"

    def to_octal(match):
        return c_oct(match.group(0)[0])

    quotes_escaped = txt.replace('"', r'\"')
    ascii_only = re.sub(r"[\x80-\xFF]", to_octal, quotes_escaped)
    return '"%s"' % ascii_only

def c_escape_for_compact(txt):
    """Like c_escape(), but for the compact format of zero-separated strings.

    The returned C string literal always ends with an explicit \\0 escape,
    so that adjacent literals concatenate into one zero-separated sequence.
    None becomes a literal containing only that terminator.
    """
    if txt is None:
        return '"\\0"'
    # escape the quotes first, then everything outside of 7-bit ASCII
    escaped = re.sub(
        r"[\x80-\xFF]",
        lambda match: c_oct(match.group(0)[0]),
        txt.replace('"', r'\"'))
    return '"%s\\0"' % escaped

def get_trans_for_lang(strings_dict, keys, lang_arg):
    """Look up the translation of every key for the language lang_arg.

    strings_dict -- maps each key to an iterable of (lang, translation) pairs
    keys         -- the strings to translate
    lang_arg     -- language code to pick the translations for

    For "en" the keys themselves are returned. Otherwise the result is a
    list parallel to keys that holds None wherever there is no translation
    or the translation is identical to the key. If more than
    INCOMPLETE_MISSING_THRESHOLD of the keys have no translation at all,
    None is returned instead of a list.
    """
    if lang_arg == "en":
        return keys

    translated = []
    missing_count = 0
    for key in keys:
        matches = [tr for (lang, tr) in strings_dict[key] if lang == lang_arg]
        if not matches:
            translated.append(None)
            missing_count += 1
            continue
        assert len(matches) == 1
        text = matches[0]
        # don't include a translation, if it's the same as the default
        translated.append(None if text == key else text)

    if missing_count > INCOMPLETE_MISSING_THRESHOLD * len(keys):
        return None
    return translated

def lang_sort_func(x, y):
    """Comparison function that orders language descriptions.

    x and y are sequences whose element 0 is the language code and whose
    element 1 is the language name. The default language ("en") sorts
    first, all other languages are ordered by name.

    Returns a negative number, zero or a positive number.
    """
    # special case: default language is first
    if x[0] == "en":
        return -1
    if y[0] == "en":
        return 1
    # portable spelling of cmp(x[1], y[1]); the cmp() builtin only
    # exists in Python 2
    return (x[1] > y[1]) - (x[1] < y[1])

# correctly sorts strings containing escaped tabulators
def key_sort_func(a, b):
    """Comparison function for translation keys.

    An escaped tabulator (a backslash followed by "t") is treated like a
    real tab character for the purpose of the comparison.

    Returns -1, 0 or 1.
    """
    a_cmp = a.replace(r"\t", "\t")
    b_cmp = b.replace(r"\t", "\t")
    # portable spelling of cmp(a_cmp, b_cmp); the cmp() builtin only
    # exists in Python 2
    return (a_cmp > b_cmp) - (a_cmp < b_cmp)

# languages dropped by the most recent build_trans_for_langs() call
# (None until that function has been called at least once)
g_incomplete_langs = None

def build_trans_for_langs(langs, strings_dict, keys):
    """Attach translations to every language and drop the incomplete ones.

    Sets lang.translations of each object in langs to the result of
    get_trans_for_lang(). Languages for which that result is None (too
    many strings untranslated) are removed from langs and remembered in
    the global g_incomplete_langs.

    langs is modified in place and also returned.
    """
    global g_incomplete_langs
    complete, incomplete = [], []
    for lang in langs:
        lang.translations = get_trans_for_lang(strings_dict, keys, lang.code)
        # only None signals "incomplete"; an empty list is a valid result
        # (no keys at all) and must not be mistaken for it
        if lang.translations is None:
            incomplete.append(lang)
        else:
            complete.append(lang)
    g_incomplete_langs = incomplete
    # slice assignment keeps mutating the caller's list, as before, while
    # avoiding one list.remove() scan per dropped language
    langs[:] = complete
    return langs

# Template of the C++ file written by gen_c_code_for_dir() (the compact
# format). It is filled in with the % operator and expects these keys:
#   langs_count, translations_count, orignal_strings (spelled exactly
#   like that), translations, langcodes, langnames, langids, islangrtl
# Note: a backslash directly before a line break (as after "gLangCodes ="
# and "gLangNames =") is a Python line continuation inside this literal,
# so neither the backslash nor the line break ends up in the output.
compact_c_tmpl = """\
/*
 DO NOT EDIT MANUALLY !!!
 Generated by scripts\\trans_gen.py
*/

#include "BaseUtil.h"

namespace trans {

#define LANGS_COUNT   %(langs_count)d
#define STRINGS_COUNT %(translations_count)d

const char *gOriginalStrings[STRINGS_COUNT] = {
%(orignal_strings)s
};

const char **GetOriginalStrings() { return &gOriginalStrings[0]; }

%(translations)s

const char *gLangCodes = \
%(langcodes)s "\\0";

const char *gLangNames = \
%(langnames)s "\\0";

// from http://msdn.microsoft.com/en-us/library/windows/desktop/dd318693(v=vs.85).aspx
// those definition are not present in 7.0A SDK my VS 2010 uses
#ifndef LANG_CENTRAL_KURDISH
#define LANG_CENTRAL_KURDISH 0x92
#endif

#ifndef SUBLANG_CENTRAL_KURDISH_CENTRAL_KURDISH_IRAQ
#define SUBLANG_CENTRAL_KURDISH_CENTRAL_KURDISH_IRAQ 0x01
#endif

#define _LANGID(lang) MAKELANGID(lang, SUBLANG_NEUTRAL)
const LANGID gLangIds[LANGS_COUNT] = {
%(langids)s
};
#undef _LANGID

bool IsLangRtl(int idx)
{
  %(islangrtl)s
}

int gLangsCount = LANGS_COUNT;
int gStringsCount = STRINGS_COUNT;

const LANGID *GetLangIds() { return &gLangIds[0]; }

} // namespace trans
"""

# every binary gets its own, uniquely named translations file,
# which keeps the build simple
def file_name_from_dir_name(dir_name):
    """Return the name of the generated translations file for dir_name.

    The top-level directory "." maps to the file for the main binary.
    """
    suffix = "sumatra" if dir_name == "." else dir_name
    if suffix is dir_name:
        return "Trans_%s_txt.cpp" % dir_name
    return "Trans_sumatra_txt.cpp"

def build_translations(langs):
    """Precompute the output representations for all but the first language.

    For every language in langs[1:] this sets:
      c_escaped_lines -- one indented C string literal per translation
      seq             -- all translations joined into one string, each
                         followed by a zero character (a missing
                         translation contributes only the zero character)
      seq_zip         -- seq compressed with zlib (level 9)
      seq_bzip        -- seq compressed with bz2 (level 9)
    """
    for lang in langs[1:]:
        translations = lang.translations
        lang.c_escaped_lines = ["  %s" % c_escape_for_compact(t) for t in translations]

        parts = [""]
        for t in translations:
            if t is not None:
                parts.append(t)
            parts.append("\0")
        joined = "".join(parts)

        lang.seq = joined
        lang.seq_zip = zlib.compress(joined, 9)
        lang.seq_bzip = bz2.compress(joined, 9)

def gen_translations(langs):
    """Return the C definitions of the uncompressed translation strings.

    Emits one "const char *" definition for every language in langs[1:],
    built from the c_escaped_lines prepared by build_translations().
    """
    def definition_for(lang):
        # the literals of a language are glued together with C line
        # continuations
        body = "\\\n".join(lang.c_escaped_lines)
        return "const char * %s = \n%s;\n" % (lang.c_translations_array_name, body)

    return "\n".join([definition_for(lang) for lang in langs[1:]])

def gen_trans_compressed_for_lang(lang):
    """Return the C definition of lang's zlib-compressed translations.

    The characters of lang.seq_zip are written as decimal numbers into an
    "unsigned char" array initializer, 24 values per line.
    """
    per_line = 24
    data = lang.seq_zip
    rows = [
        ",".join([str(ord(ch)) for ch in data[start:start + per_line]])
        for start in range(0, len(data), per_line)
    ]
    body = ",\n  ".join(rows)
    return "static unsigned char %s[] = {\n  %s\n};\n" % (lang.c_translations_array_name, body)

# The two templates below produce the text that gen_c_code_for_dir()
# substitutes for %(translations)s in compact_c_tmpl; both refer to the
# LANGS_COUNT macro defined there.

# Variant for compressed translations; expects the keys
# translations, compressed_sizes and translations_refs.
compressed_tmpl = """
%(translations)s

// for each lang: uncompressed size, compressed size
const uint32_t gLangsCompressionInfo[LANGS_COUNT*2] = {
%(compressed_sizes)s
};

static const unsigned char *gTranslations[LANGS_COUNT] = {
%(translations_refs)s
};

const unsigned char *GetTranslationsForLang(int langIdx, uint32_t *uncompressedSizeOut, uint32_t *compressedSizeOut) {
    *uncompressedSizeOut = gLangsCompressionInfo[langIdx*2];
    *compressedSizeOut = gLangsCompressionInfo[langIdx*2+1];
    return gTranslations[langIdx];
}
"""

# Variant for uncompressed translations; expects the keys
# translations and translations_refs.
uncompressed_tmpl = """
%(translations)s

static const char *gTranslations[LANGS_COUNT] = {
%(translations_refs)s
};

const char *GetTranslationsForLang(int langIdx) { return gTranslations[langIdx]; }
"""

def gen_translations_compressed(langs):
    """Return the C code and the size table for the compressed translations.

    The result is a tuple (translations, compressed_sizes):
      translations     -- the array definitions of all languages in
                          langs[1:], see gen_trans_compressed_for_lang()
      compressed_sizes -- comma separated "uncompressed, compressed" size
                          pairs; the pair of the first language is "0, 0"
    """
    translated = langs[1:]
    definitions = [gen_trans_compressed_for_lang(lang) for lang in translated]

    # the first language has no translations of its own
    sizes = ["0", "0"]
    for lang in translated:
        sizes += [str(len(lang.seq)), str(len(lang.seq_zip))]

    return ("\n".join(definitions), "  " + ", ".join(sizes))

# how many percent of total x amounts to
def perc(total, x):
    """Return x as a percentage of total, e.g. perc(100, 60) is 60.0."""
    scaled = x * 100.0
    return scaled / total

def print_stats(langs):
    """Print how well the translations of langs[1:] compress.

    Compares the total size of the uncompressed sequences with their
    zlib- and bz2-compressed sizes, as prepared by build_translations().
    """
    translated = langs[1:]
    uncompressed = sum([len(lang.seq) for lang in translated])
    compressed_zip = sum([len(lang.seq_zip) for lang in translated])
    compressed_bzip = sum([len(lang.seq_bzip) for lang in translated])

    savezip = uncompressed - compressed_zip
    savebzip = uncompressed - compressed_bzip
    vals = (
        uncompressed,
        compressed_zip, perc(uncompressed, compressed_zip), savezip,
        compressed_bzip, perc(uncompressed, compressed_bzip), savebzip,
        # how much more bzip saves compared to zip
        savebzip - savezip,
    )
    print("\nLen: %d zip: %d %.2f%% (-%d), bzip: %d %.2f%% (-%d), bzip over zip: %d" % vals)

def print_incomplete_langs(dir_name):
    """Print which languages were dropped as incomplete for dir_name.

    Reports the languages currently held in g_incomplete_langs, together
    with the name of the file generated for dir_name.
    """
    codes = [lang.code for lang in g_incomplete_langs]
    count = "%d out of %d" % (len(codes), len(trans_langs.g_langs))
    file_name = file_name_from_dir_name(dir_name)
    print("\nIncomplete langs in %s: %s %s" % (file_name, count, ", ".join(codes)))

def gen_c_code_for_dir(strings_dict, keys, dir_name, compressed=False):
    """Generate the compact-format translations file for one directory.

    strings_dict -- maps each key to its (lang, translation) pairs
    keys         -- the translatable strings of this directory
    dir_name     -- directory below SRC_DIR that the file is written to
    compressed   -- embed the translations zlib-compressed (compressed_tmpl)
                    instead of as plain strings (uncompressed_tmpl)

    Writes the file named by file_name_from_dir_name() and prints the
    languages that were left out as incomplete.
    """
    langs = get_lang_objects(sorted(trans_langs.g_langs, cmp=lang_sort_func))
    assert "en" == langs[0].code
    langs = build_trans_for_langs(langs, strings_dict, keys)

    langcodes = " \\\n".join(["  %s" % c_escape_for_compact(lang.code) for lang in langs])
    langnames = " \\\n".join(["  %s" % c_escape_for_compact(lang.name) for lang in langs])
    langids = ",\n".join(["  %s" % lang.ms_lang_id for lang in langs])

    # enumerate() yields the position directly, without searching the
    # list once per language as langs.index() would
    rtl_info = ["(%d == idx)" % idx for (idx, lang) in enumerate(langs) if lang.isRtl]
    islangrtl = "return %s;" % (" || ".join(rtl_info) or "false")

    build_translations(langs)

    # the first language (the original strings) has no translations array
    translations_refs = "  NULL,\n" + ", \n".join(["  %s" % lang.c_translations_array_name for lang in langs[1:]])
    if compressed:
        (translations, compressed_sizes) = gen_translations_compressed(langs)
        translations = compressed_tmpl % {
            "translations": translations,
            "compressed_sizes": compressed_sizes,
            "translations_refs": translations_refs,
        }
    else:
        translations = uncompressed_tmpl % {
            "translations": gen_translations(langs),
            "translations_refs": translations_refs,
        }

    orignal_strings = ",\n".join(["  %s" % c_escape(t) for t in langs[0].translations])

    # name every template value explicitly rather than passing locals(),
    # so that renaming a local variable can't silently break the template
    file_content = compact_c_tmpl % {
        "langs_count": len(langs),
        "translations_count": len(keys),
        "orignal_strings": orignal_strings,
        "translations": translations,
        "langcodes": langcodes,
        "langnames": langnames,
        "langids": langids,
        "islangrtl": islangrtl,
    }
    file_path = os.path.join(SRC_DIR, dir_name, file_name_from_dir_name(dir_name))
    # the with-block guarantees that the file is flushed and closed
    with open(file_path, "wb") as out:
        out.write(file_content)

    print_incomplete_langs(dir_name)
    #print_stats(langs)

def gen_c_code_simple(strings_dict, keys, dir_name):
    """Generate the simple-format translations file for one directory.

    strings_dict -- maps each key to its (lang, translation) pairs
    keys         -- the translatable strings of this directory
    dir_name     -- directory below SRC_DIR that the file is written to

    All translations of all languages end up in one flat array, so the
    index generated for a language is its position multiplied by the
    number of keys. Writes the file named by file_name_from_dir_name()
    (with a UTF-8 BOM) and prints the languages that were left out as
    incomplete.
    """
    langs = get_lang_objects(sorted(trans_langs.g_langs, cmp=lang_sort_func))
    assert "en" == langs[0].code
    langs = build_trans_for_langs(langs, strings_dict, keys)

    lines = []
    for lang in langs:
        lines.append('  /* Translations for language %s */' % lang.code)
        lines += ['  L"%s",' % t.replace('"', '\\"') if t else '  NULL,' for t in lang.translations]
        lines.append("")
    # drop the empty separator line after the last language
    lines.pop()
    translations = "\n".join(lines)

    langs_grp = ['"%s"' % lang.code for lang in langs] + ["NULL"]
    langs_list = ",\n    ".join([", ".join(grp) for grp in util2.group(langs_grp, 10)])

    # enumerate() yields the position directly, without searching the
    # list once per language as langs.index() would
    strings_count = len(keys)
    cases = ["case %s: return %d;" % (lang.ms_lang_id, idx * strings_count)
             for (idx, lang) in enumerate(langs) if lang.ms_lang_id != "(LANGID)-1"]
    lang_id_to_index = "\n    ".join(cases + ["default: return -1;"])
    rtl_checks = ["%d == index" % (idx * strings_count)
                  for (idx, lang) in enumerate(langs) if lang.isRtl]
    rtl_lang_cmp = " || ".join(rtl_checks) or "false"

    # name every template value explicitly rather than passing locals(),
    # so that renaming a local variable can't silently break the template
    file_content = codecs.BOM_UTF8 + TRANSLATIONS_TXT_SIMPLE % {
        "translations_count": strings_count,
        "translations": translations,
        "langs_list": langs_list,
        "lang_id_to_index": lang_id_to_index,
        "rtl_lang_cmp": rtl_lang_cmp,
    }
    file_name = os.path.join(SRC_DIR, dir_name, file_name_from_dir_name(dir_name))
    # the with-block guarantees that the file is flushed and closed
    with open(file_name, "wb") as out:
        out.write(file_content)

    print_incomplete_langs(dir_name)

def gen_c_code(strings_dict, strings):
    """Generate the translations file of every dir in C_DIRS_TO_PROCESS.

    strings_dict -- maps each key to its (lang, translation) pairs
    strings      -- (string, dir_name) tuples, naming the directory each
                    translatable string was found in

    Only strings that are present in strings_dict are used. Directories
    listed in C_SIMPLE_FORMAT_DIRS get the simple format, all others the
    compact one.
    """
    for dir_name in C_DIRS_TO_PROCESS:
        keys = sorted(
            [s[0] for s in strings if s[1] == dir_name and s[0] in strings_dict],
            cmp=key_sort_func)
        if dir_name in C_SIMPLE_FORMAT_DIRS:
            gen_c_code_simple(strings_dict, keys, dir_name)
        else:
            gen_c_code_for_dir(strings_dict, keys, dir_name)

def main():
    """Regenerate the language files via trans_download.regenerateLangs()."""
    # imported here rather than at the top of the file, as in the
    # original; presumably to avoid a circular import -- TODO confirm
    from trans_download import regenerateLangs
    regenerateLangs()

if __name__ == "__main__":
    main()
